/*
This file caches the lasso dataset for the Alpha paper.

This runs for Current symptoms (c) and Past symptoms (p) for the four column conditions:
  1) just symptoms
  2) symptoms and non-symptom characteristics
  3) 3-way interactions of all symptoms
  4) 3-way interactions of all symptoms and non-symptom characteristics

1&2 are simple, 3&4 explode exponentially, which for #4 is 15 symptoms and 30 non symptoms, or roughly C(45,3)=14,190 combinations to test.
That's a lot of regressions to run, so we cut down on the number by removing all colinear combinations of these variables first and cache them (2.4_set_bootstrap_w_scalars.do).
We then run the lasso bootstrapping routine to get correct SEs, and cache those files (in ./output/bootstrap/).

Run from other file, thus no CDs or global var defs.
*/

global writeout 0 // change to 1 for writing out

set maxvar 120000
set seed 1337

capture cd "~/Desktop/RESTAT_CODE"

global dir_root = "."
global dir_code = "${dir_root}/code"
global dir_data = "${dir_root}/data"
global dir_output "${dir_root}/output"

global output_base "${dir_output}/bootstrap/boostrap_results_"

global B = 1000
scalar cdf_h = .1
scalar a_n = 0.05

// Load Data
do "${dir_code}/1.1_cache-load_lasso_data.do"
// To fit naming convention below, change cia_* to something else
foreach v of varlist cia_* pcr_* pct_* {
    rename `v' test_`v'
}


/* // Uncomment to run for in-person bootstrap subsample
global output_base "${output_base}knock_"
keep if phase1_inperson == 1 */


/* // Uncomment to run for letter bootstrap subsample
global output_base "${output_base}letters_"
keep if phase2 == 1 */


** Make W matrices first time
foreach cp in "c" "p" {
    scalar w_`cp'_1 = "`cp'sym*"
    scalar w_`cp'_2 = "`=w_`cp'_1' nonvar*"
    scalar w_`cp'_3 = "`=w_`cp'_1' `cp'`cp'_* `cp'`cp'`cp'_*"
    scalar w_`cp'_4 = "`=w_`cp'_2' `cp'?_* `cp'??_* nnn_*"

    foreach cpi of numlist 1/4 {
        local tmp "`=w_`cp'_`cpi''"
        fvunab tmp: `tmp'
        scalar w_`cp'_`cpi'="`tmp'"
    }
}

** Remove colinear variables
foreach cp in "c" "p" {
  foreach cpi of numlist 1/4 {
      // The letter and in person subsamples just run for parameters, so skip `cpi' > 1
      if (`cpi' > 1) & (strpos("$output_base", "knock") + strpos("$output_base", "letter") > 0) continue

      local tmp "`=w_`cp'_`cpi''"
      fvunab tmp: `tmp'
      scalar w_`cp'_`cpi'="`tmp'"

      scalar keep_w_`cp'_`cpi'=""
      local all_vars `=w_`cp'_`cpi''
      local num_tot_words: word count `=w_`cp'_`cpi''
      local num_drop_words 0
      local num_keep_words 0

      noi di "Dropping: ", _c
      quietly foreach v of local all_vars {
          reg `v' `=keep_w_`cp'_`cpi''
          if e(r2) > .99 {
              noi di "`v'", _c
              local ++num_drop_words
              continue
          }
          scalar keep_w_`cp'_`cpi'="`=keep_w_`cp'_`cpi'' `v'"
          local ++num_keep_words
          /* Below tracks progress with output every 100 vars */
          //if mod(`num_keep_words'+`num_drop_words', 100) == 0 noi di "`cp'-`cpi'(+/-/=): `num_keep_words'/`num_drop_words'/`num_tot_words' (`=round(100*`num_keep_words'/`num_tot_words')' pct)"
      }
      noi di
      scalar w_`cp'_`cpi'="`=keep_w_`cp'_`cpi''"
      /*
      We did this in a jupyter notebook Stata kernel, and used the command:
      !echo "    scalar w_`cp'_`cpi'="`=keep_w_`cp'_`cpi''"" >> "2.4_set_bootstrap_w_scalars.do"
      to get the output into the stata .do file.
      If running this from stata, unfortunately the only solution seems to be copying
      the output made from the command below manually into the 2.4 do file.
      */
      noi di "Keeping `cp'-`cpi': `=w_`cp'_`cpi''"
  } // end 1/4 numlist
}


** Run bootstrap routine
quietly ///
foreach cp in "c" "p" {
    foreach cpi of numlist 1/4 {
      // The letter and in person subsamples just run for parameters, so skip `cpi' > 1
      if (`cpi' > 1) & (strpos("$output_base", "knock") + strpos("$output_base", "letter") > 0) continue

        global res_out "${output_base}`cp'-`cpi'"
        noi di "Outputting to $res_out"

        // Load the data
        use "${dir_data}/lasso_table_data_interactions.dta", clear
        cap gen DI = cia_pos // Infectious
        cap gen Dtau = tested_yn // Tested
        keep DI Dtau `=w_`cp'_`cpi''

        /*
        START INNER ROUTINE
        */
        noisily display "Running `cp' loop `cpi'...", _c
        quietly {
            global W_variables "`=w_`cp'_`cpi''"
            global W_plus="$W_variables 1"


            foreach wvar in $W_plus {
              gen W_`wvar' = `wvar' * 1
              gen DI_W_`wvar' = DI * W_`wvar'
              gen Dtau_W_`wvar' = Dtau * W_`wvar'
              gen DI_Dtau_W_`wvar' = DI *Dtau * W_`wvar'
            }


            *------------------------------------------------------------------------------*
            *   Step 1: Construct estimate of theta - residual vector
            *------------------------------------------------------------------------------*
            rlasso DI $W_variables
              mat beta_hat = e(b)

               quietly { // calculate theta for base lasso
                // Define F[Beta] for beta_tilde
                  sum DI
                scalar cdf_offset = r(mean)

                foreach wvar in $W_plus {
                  gen F_beta_hat_`wvar' = normal(`=-cdf_offset/cdf_h') // equivalent to 0
                }
                  local i = 0
                foreach col in `:colnames beta_hat' {
                  local ++i
                  if "`col'" == "_cons" local col = 1
                  replace F_beta_hat_`col' = normal((`col' * beta_hat[1,`i'] - `=cdf_offset')/`=cdf_h')
                }

                foreach thetavar in DI Dtau {
                    sum `thetavar'
                    scalar `thetavar'_bar = r(mean)
                }

                  foreach thetavar in W DI_W Dtau_W DI_Dtau_W {
                    scalar `thetavar'_FB_hat = 0
                    foreach wvar in $W_plus {
                      sum `thetavar'_`wvar'
                      scalar `thetavar'_`wvar'_bar = r(mean)
                      scalar `thetavar'_FB_hat = `=`thetavar'_FB_hat + r(mean) * F_beta_hat_`wvar''
                    }
                  }

                  scalar lambda_hat = (`=DI_W_FB_hat' * (1-`=DI_bar')) / (`=DI_bar' * (`=W_FB_hat' - `=DI_W_FB_hat'))
                  display `"    scalar lambda = (`=DI_W_FB_hat' * (1-`=DI_bar')) / (`=DI_bar' * (`=W_FB_hat' - `=DI_W_FB_hat')) = `=lambda_hat'"'
                  scalar ua_hat = `=Dtau_W_FB_hat' / `=Dtau_bar'
                  display `"    scalar ua = `=Dtau_W_FB_hat' / `=Dtau_bar' = `=ua_hat'"'
                  scalar ub_hat = `=DI_Dtau_W_FB_hat' * `=W_FB_hat' / (`=DI_W_FB_hat' * `=Dtau_W_FB_hat')
                  display `"    scalar ub = `=DI_Dtau_W_FB_hat' * `=W_FB_hat' / (`=DI_W_FB_hat' * `=Dtau_W_FB_hat') = `=ub_hat'"'
                  scalar alpha_1_hat = (`=DI_W_FB_hat' * (1-`=DI_bar'))
                  display `"    scalar alpha_1_hat = (`=DI_W_FB_hat' * (1-`=DI_bar')) = `=alpha_1_hat'"'
                  scalar alpha_0_hat = (`=DI_bar' * (`=W_FB_hat' - `=DI_W_FB_hat'))
                  display `"    scalar alpha_0_hat = (`=DI_bar' * (`=W_FB_hat' - `=DI_W_FB_hat')) = `=alpha_0_hat'"'
                } // end quietly calculate theta for base lasso

            *------------------------------------------------------------------------------*
            *   Step 2: Zero out loW beta values & generate epsilon tilde matrix
            *------------------------------------------------------------------------------*

            mat beta_tilde = [beta_hat]
            foreach i of numlist 1/`=colsof(beta_tilde)' {
              if beta_tilde[1,`i'] < = `=a_n' mat beta_tilde[1,`i'] = 0
            }


            // Define F[Beta] for beta_tilde
              sum DI
            scalar cdf_offset = r(mean)

            foreach wvar in $W_plus {
              gen F_beta_`wvar' = normal(`=-cdf_offset/cdf_h') // equivalent to 0
            }


            cap drop xbetatilde
            gen xbetatilde = 0
            local i = 0
            foreach col in `:colnames beta_tilde' {
              local ++i
              if "`col'" == "_cons" local col = 1
              else replace F_beta_`col' = normal((`col' * beta_tilde[1,`i'] - `=cdf_offset')/`=cdf_h')

              replace xbetatilde = xbetatilde + `col' * beta_tilde[1,`i']
            }

            quietly { // Form XB through DIDtauW for epsilon_tilde
                cap drop etilde_xbetatilde
              gen etilde_xbetatilde = DI - xbetatilde
                sum etilde_xbetatilde
                replace etilde_xbetatilde = etilde_xbetatilde - r(mean)

              foreach thetavar in DI Dtau {
                cap drop etilde_`thetavar'
                sum `thetavar'
                gen etilde_`thetavar' = `thetavar' - r(mean)
                scalar `thetavar'_bar = r(mean)
              }

              foreach thetavar in W DI_W Dtau_W DI_Dtau_W {
                scalar `thetavar'_FB = 0
                foreach wvar in $W_plus {
                  sum `thetavar'_`wvar'
                  gen etilde_`thetavar'_`wvar' = `thetavar'_`wvar' - r(mean)
                  scalar `thetavar'_`wvar'_bar = r(mean)
                  scalar `thetavar'_FB = `=`thetavar'_FB + r(mean) * F_beta_`wvar''
                }
              }

              scalar lambda_tilde = (`=DI_W_FB' * (1-`=DI_bar')) / (`=DI_bar' * (`=W_FB' - `=DI_W_FB'))
              scalar ua_tilde = `=Dtau_W_FB' / `=Dtau_bar'
              scalar ub_tilde = `=DI_Dtau_W_FB' * `=W_FB' / (`=DI_W_FB' * `=Dtau_W_FB')
              scalar alpha_1_tilde = (`=DI_W_FB' * (1-`=DI_bar'))
              scalar alpha_0_tilde = (`=DI_bar' * (`=W_FB' - `=DI_W_FB'))
            } // end quietly Form XB through DIDtauW for epsilon_tilde

            *------------------------------------------------------------------------------*
            *   Step 3: Sample epislon tilde matrix With replacement --> epsilon star
            *------------------------------------------------------------------------------*
            display "Iteration ", _c
            quietly foreach b of numlist 1/$B {
              noi display "`b' ", _c
              preserve
              bsample // sample all N with replacement

              *------------------------------------------------------------------------------*
              *   Step 4: Simulate new data with expected value plus epislon star
              *------------------------------------------------------------------------------*
              quietly { // Form star version of XB through DIDtauW_*
                gen xbetatilde_star = xbetatilde + etilde_xbetatilde

                foreach thetavar in DI Dtau {
                  gen `thetavar'_star = `thetavar'_bar + etilde_`thetavar'
                }

                foreach thetavar in W DI_W Dtau_W DI_Dtau_W {
                  foreach wvar in $W_plus {
                    gen `thetavar'_`wvar'_star = `thetavar'_`wvar'_bar + etilde_`thetavar'_`wvar'
                  }
                }
              } // end quietly

              *------------------------------------------------------------------------------*
              *   Step 5: Estimate new lasso parameters --> theta star vector
              *------------------------------------------------------------------------------*
                sum DI_star
              scalar cdf_offset_star = r(mean)

              rlasso xbetatilde_star $W_variables

              quietly { // generate theta*
                // Predict new beta for new FB
                predict xbeta_star
                mat beta_star = e(b)

                local i = 0
                foreach wvar in $W_plus {
                  gen F_beta_star_`wvar' = normal(`=-cdf_offset_star/cdf_h') // equivalent to 0
                }
                foreach col in `:colnames beta_star' { // overwrite F_beta_star with non-zero lasso coefficients
                  local ++i
                  if "`col'" == "_cons" local col 1
                  replace F_beta_star_`col' = normal((`col' * beta_star[1,`i'] - `=cdf_offset_star')/`=cdf_h')
                }

                // make theta* vars for G*
                foreach thetavar in DI Dtau {
                  sum `thetavar'_star
                  scalar `thetavar'_star_bar = r(mean)
                }
                foreach thetavar in W DI_W Dtau_W DI_Dtau_W {
                  scalar `thetavar'_FB_star = 0
                  foreach wvar in $W_plus {
                    sum `thetavar'_`wvar'
                    scalar `thetavar'_`wvar'_star_bar = r(mean)
                    scalar `thetavar'_FB_star = `=`thetavar'_FB_star + r(mean) * F_beta_star_`wvar''
                  }
                }
              } // end quietly generate theta star
              *------------------------------------------------------------------------------*
              *   Step 6: Loop 3-5 for B iterations
              *------------------------------------------------------------------------------*

              scalar lambda_star = (`=DI_W_FB_star' * (1-`=DI_star_bar')) / (`=DI_star_bar' * (`=W_FB_star' - `=DI_W_FB_star'))
              scalar ua_star = `=Dtau_W_FB_star' / `=Dtau_star_bar'
              scalar ub_star = `=DI_Dtau_W_FB_star' * `=W_FB_star' / (`=DI_W_FB_star' * `=Dtau_W_FB_star')
              scalar alpha_1_star = (`=DI_W_FB_star' * (1-`=DI_star_bar'))
              scalar alpha_0_star = (`=DI_star_bar' * (`=W_FB_star' - `=DI_W_FB_star'))

              mat T_star_i = (`=(lambda_star - lambda_tilde)', `=(ua_star - ua_tilde)', `=(ub_star - ub_tilde)', `=(alpha_1_star - alpha_1_tilde)', `=(alpha_0_star - alpha_0_tilde)')
              mat params_i = (`=lambda_star', `=ua_star', `=ub_star', `=alpha_1_star', `=alpha_0_star')

              if `b' == 1 {
                mat T_star = T_star_i
                mat colnames T_star = "lambda" "u_a" "u_b" "alpha_1" "alpha_0"
                mat params = params_i
                mat colnames params = "lambda" "u_a" "u_b" "alpha_1" "alpha_0"
              }
              else {
                mat T_star = (T_star \ T_star_i)
                mat params = (params \ params_i)
              }

            restore
            }
        }
        noisily display " done!"
        /*
        END INNER ROUTINE
        */


        *------------------------------------------------------------------------------*
        *   Step 7: Compute standard error
        *------------------------------------------------------------------------------*
        svmat T_star, names(matcol)
        svmat params, names(matcol)

        foreach v in `:colnames T_star' {
          sum T_star`v'
          scalar T_`v'_se_`cp'`cpi' = r(sd)
          sum params`v'
          scalar param_`v'_se_`cp'`cpi' = r(sd)
          noisily display "    `v': `=T_`v'_se_`cp'`cpi'', `=param_`v'_se_`cp'`cpi''"
        }


        *------------------------------------------------------------------------------*
        *   Step 8: Compute CI
        *------------------------------------------------------------------------------*
        foreach v in `:colnames T_star' {
          sum T_star`v'
          scalar T_`v'_bar_`cp'`cpi' = r(mean)
          _pctile T_star`v', p(5 95)
          scalar T_`v'_ci_5_`cp'`cpi' = -r(r1) + `=T_`v'_bar_`cp'`cpi''
          scalar T_`v'_ci_95_`cp'`cpi' = r(r2) - `=T_`v'_bar_`cp'`cpi''
          noisily display "    `v': [`=T_`v'_ci_5_`cp'`cpi'' < `=T_`v'_bar_`cp'`cpi'' > `=T_`v'_ci_95_`cp'`cpi'']"
        }

        keep T_star* params*
        noisily display `"   Saving to "${res_out}.dta""'
        if $writeout save "${res_out}.dta", replace
    } // end cpi
} // end cp
